import os
from sklearn.utils import shuffle
import cv2
import numpy as np
import h5py

def new_select_drivers(image_narrays, lable_narrays, drivers_id, drivers_list):
    """Keep only the samples whose driver id appears in ``drivers_list``.

    The three input sequences are parallel: position ``i`` of each one
    describes the same sample.

    Args:
        image_narrays: per-sample image entries (indexable).
        lable_narrays: per-sample label entries (indexable).
        drivers_id: per-sample driver identifiers (indexable, has a length).
        drivers_list: collection of driver identifiers to keep.

    Returns:
        A 4-tuple ``(data, target, new_drivers_id, index)`` of lists holding
        the selected images, labels, driver ids and the original positions
        of the selected samples, all in input order.
    """
    # Positions of the samples to keep, in their original order.
    index = [
        pos for pos in range(len(drivers_id))
        if drivers_id[pos] in drivers_list
    ]
    data = [image_narrays[pos] for pos in index]
    target = [lable_narrays[pos] for pos in index]
    new_drivers_id = [drivers_id[pos] for pos in index]
    return data, target, new_drivers_id, index


def get_drivers(path='/media/dell/delldisk/dell/wxm/Data/KaggleDDD/driver_imgs_list.csv'):
    """Read the driver/image list and map each image name to its driver.

    Each non-blank line is split on commas; the third field is used as the
    key and the first field as the value. Every line is consumed, including
    any header row, which therefore ends up as an ordinary entry in the
    result.

    Args:
        path: location of the comma-separated list file. Defaults to the
            dataset location this script was written for.

    Returns:
        dict mapping the third column of each line to its first column.

    Raises:
        IOError/OSError: if the file cannot be opened.
        IndexError: if a non-blank line has fewer than three fields.
    """
    dr = dict()
    print('Read drivers data')
    # The context manager closes the file even when a malformed line raises.
    with open(path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at the end of the
                # file) carry no record; skip them instead of crashing.
                continue
            arr = line.split(',')
            dr[arr[2]] = arr[0]
    return dr


def resample_by_sortedID(data, target, new_drivers_id):
    """Reorder three parallel sequences in place by ascending driver id.

    The sort is stable, so samples that share a driver id keep their
    relative order.

    Args:
        data: per-sample entries; overwritten position by position.
        target: per-sample labels; overwritten position by position.
        new_drivers_id: per-sample driver ids used as the sort key;
            overwritten position by position.

    Returns:
        The same three objects ``(data, target, new_drivers_id)`` after
        reordering.
    """
    # Bundle each sample so the three fields move together while sorting.
    samples = [
        (data[pos], target[pos], new_drivers_id[pos])
        for pos in range(len(data))
    ]
    samples.sort(key=lambda sample: sample[2])
    # Write the sorted samples back into the caller's containers.
    for pos, (item, label, driver) in enumerate(samples):
        data[pos] = item
        target[pos] = label
        new_drivers_id[pos] = driver
    return data, target, new_drivers_id


def load_imglabel():
    """Scan the training folder and collect path, label and driver per image.

    Every entry of the training folder is treated as a class sub-folder
    whose label is the integer value of the last character of its name.
    The driver of each image is looked up by file name in the mapping
    returned by ``get_drivers()``.

    Returns:
        A 4-tuple ``(img_paths, labels, drivers_id, unique_drivers)``:
        image paths relative to the training folder, the matching integer
        labels, the matching driver ids, and the sorted list of distinct
        driver ids.
    """
    image_to_driver = get_drivers()
    train_root = '/media/dell/delldisk/dell/wxm/Data/KaggleDDD/train'
    img_paths, labels, drivers_id = [], [], []  # parallel per-image lists
    for class_dir in os.listdir(train_root):
        # The class label is encoded as the final character of the folder name.
        class_label = int(class_dir[-1])
        file_names = os.listdir(os.path.join(train_root, class_dir))
        drivers_id.extend([image_to_driver[name] for name in file_names])
        img_paths.extend([os.path.join(class_dir, name) for name in file_names])
        labels.extend([class_label] * len(file_names))
    unique_drivers = sorted(set(drivers_id))
    return img_paths, labels, drivers_id, unique_drivers

# ---------------------------------------------------------------------------
# Script section: export the training set as one HDF5 file per driver.
#
# Each output file is named '<driver id>.h5' and holds two datasets:
#   'img_arrs' - the images of that driver as a uint8 array
#   'labels'   - the matching class labels as a uint8 array
# ---------------------------------------------------------------------------
TRAIN_DIR = '/media/dell/delldisk/dell/wxm/Data/KaggleDDD/train/'
H5_DIR = '/media/dell/delldisk/dell/wxm/Data/KaggleDDD/h5/'


def _write_driver_h5(driver, images, image_labels):
    """Write one driver's images and labels to ``H5_DIR/<driver>.h5``.

    Nothing is written when ``images`` is empty, so no empty files are
    produced.
    """
    if not images:
        return
    # Open with an explicit 'w' mode: the default mode has changed across
    # h5py releases, and relying on it can fail when the file does not exist
    # yet. 'w' also lets the script be re-run over previous output.
    with h5py.File(os.path.join(H5_DIR, driver + '.h5'), 'w') as h5_file:
        h5_file.create_dataset('img_arrs', data=np.asarray(images, np.uint8))
        h5_file.create_dataset('labels', data=np.asarray(image_labels, np.uint8))


img_paths, labels, drivers_id, unique_drivers = load_imglabel()
# unique_drivers contains every driver found, so this selection keeps all samples.
img_paths, labels, drivers_id_, index = new_select_drivers(img_paths, labels, drivers_id, unique_drivers)
# Shuffle, then stable-sort by driver id: samples end up grouped per driver,
# in random order inside each group.
img_paths, labels, drivers_id = shuffle(img_paths, labels, drivers_id_)
img_paths, labels, drivers_id = resample_by_sortedID(img_paths, labels, drivers_id)

group_driver = None  # driver whose samples are currently being accumulated
img_arrs = []
label_vecs = []
for current_id, img_path, label in zip(drivers_id, img_paths, labels):
    print(current_id)
    if current_id != group_driver:
        # A new driver starts: flush the finished group, then start a fresh
        # one. The current sample is appended below, so the first image of
        # every driver is kept.
        _write_driver_h5(group_driver, img_arrs, label_vecs)
        group_driver = current_id
        img_arrs = []
        label_vecs = []
    image = cv2.imread(os.path.join(TRAIN_DIR, img_path))
    if image is None:
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with the offending path rather than later in np.asarray.
        raise IOError('Could not read image: ' + img_path)
    img_arrs.append(image)
    label_vecs.append(label)
# The loop only flushes when the driver changes, so the last driver's group
# still has to be written here.
_write_driver_h5(group_driver, img_arrs, label_vecs)


